import datasets
from datasets import load_dataset,Audio
from transformers import Wav2Vec2ForCTC, AutoProcessor
import torch

# Checkpoint identifier handed to both `from_pretrained` calls below, so the
# processor and the model always come from the same checkpoint.
model_id = "facebook/mms-1b-fl102"
# Alternative checkpoint (swap the assignment above to use it):
# model_id = "facebook/mms-300m"
# NOTE: these two statements run at import time, so merely importing this
# module loads the processor and the CTC model for `model_id`.
processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)


# Mandarin Chinese: the Common Voice config requested here is "zh-CN", not
# English. streaming=True iterates the test split lazily instead of
# materialising it before the first example can be read.
stream_data = load_dataset(
    "mozilla-foundation/common_voice_13_0",
    "zh-CN",
    split="test",
    streaming=True,
)
# Have the "audio" column decoded at a 16 kHz sampling rate.
stream_data = stream_data.cast_column("audio", Audio(sampling_rate=16000))
# Waveform array of the first example yielded by the stream.
zh_sample = next(iter(stream_data))["audio"]["array"]
# Backward-compatible alias: the sample was historically (and misleadingly)
# exposed as `en_sample` even though it holds zh-CN audio.
en_sample = zh_sample

# French
# stream_data = load_dataset("mozilla-foundation/common_voice_13_0", "fr", split="test", streaming=True)
# stream_data = stream_data.cast_column("audio", Audio(sampling_rate=16000))
# fr_sample = next(iter(stream_data))["audio"]["array"]


# Script entry point. Intentionally a no-op: all of the work in this file
# (model and dataset loading) already happens at module level above.
if __name__ == "__main__":
    pass